# -*- coding: utf-8 -*-
import scrapy
from ipproxy.items import IpproxyItem


class GuobaojiaSpider(scrapy.Spider):
    """Scrape free-proxy listings from goubanjia.com.

    The number of index pages to crawl is configurable as a spider
    argument, e.g. ``scrapy crawl guobaojia -a page_count=5``.
    """

    name = 'guobaojia'
    allowed_domains = ['www.goubanjia.com']

    def __init__(self, page_count=3, *args, **kwargs):
        """Build the list of listing-page URLs to crawl.

        :param page_count: number of ``indexN.shtml`` pages to fetch.
            Scrapy delivers ``-a`` command-line arguments as strings, so
            the value is coerced to ``int`` before use (the original code
            raised ``TypeError`` in ``range`` whenever the argument was
            actually passed on the command line).
        """
        super(GuobaojiaSpider, self).__init__(*args, **kwargs)
        page_count = int(page_count)  # "-a page_count=5" arrives as str
        self.start_urls = [
            "http://www.goubanjia.com/free/gngn/index{0}.shtml".format(i)
            for i in range(1, page_count + 1)
        ]

    def parse(self, response):
        """Yield one :class:`IpproxyItem` per data row of the proxy table.

        The IP cell mixes real text with decoy elements styled
        ``display: none``; only the visible children are joined to form
        the address, and the last text node of the same cell is taken as
        the port (presumably the site renders it that way — verify
        against live markup if rows come out wrong).
        """
        all_trs = response.xpath('//*[@id="list"]/table//tr')
        for tr in all_trs[1:]:  # first <tr> is the table header
            row = IpproxyItem()
            ipinfo = tr.xpath('td')
            # Keep only visible text nodes; hidden decoys carry display:none.
            row['ip'] = ''.join(ipinfo[0].xpath('*[@style!="display: none"]/text()').extract())
            row['port'] = ipinfo[0].xpath('*/text()').extract()[-1]
            row['proxy_type'] = ipinfo[2].xpath('a/text()').extract_first()
            row['anonymous'] = ipinfo[1].xpath('a/text()').extract_first()
            row['country'] = ' '.join(ipinfo[3].xpath('a/text()').extract())
            # Speed column looks like "1.23 秒" (seconds); strip the unit
            # suffix before converting to float.
            row['speed'] = float(ipinfo[5].xpath('text()').extract_first().replace(' 秒', ''))
            row['checked_time'] = None  # ipinfo[6] (check time) not parsed yet
            row['proxy_name'] = self.name
            yield row
